# importing packages we'll need
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
# reading the data set
# issue_d is forced to plain strings via the dtype override; low_memory=False
# makes pandas infer dtypes from the whole file at once (145 columns, ~1M rows)
df = pd.read_csv('loan.csv', dtype = {'issue_d': str}, low_memory=False)
# https://www.kaggle.com/wendykan/lending-club-loan-data
In order to begin to understand our data, let's take a look at the summary statistics for the dollar amounts of these loans.
df.loan_amnt.describe()
# summary statistics (count/mean/std/quartiles) for the loan_amnt column
count 1.048575e+06 mean 1.538487e+04 std 9.533838e+03 min 1.000000e+03 25% 8.000000e+03 50% 1.300000e+04 75% 2.000000e+04 max 4.000000e+04 Name: loan_amnt, dtype: float64
The first thing to note is that Lending Club facilitated an amazing 1,048,575 loans over this period! Apparently a lot of people are comfortable investing in these loans. It appears that the range of allowable loan sizes is \$1,000-\$40,000. The mean loan size of \$15,385 looks to be representative of a typical loan based on the 25th, 50th, and 75th percentile values. It looks like loan sizes are more common towards the lower end of the acceptable range, but let's check for sure.
# Histogram of loan sizes across the full data set
plt.hist(df['loan_amnt'])
plt.title('Loan Amount Distribution')
plt.xlabel('Loan Amount, $')
plt.ylabel('Count')
plt.show()
We were right — there are definitely more loans toward the lower end of the range. In fact, it looks as though loan sizes follow a gamma distribution. Let's construct a q-q plot to see if this is, in fact, a gamma distribution.
# Rough Q-Q plot: sampled loan amounts vs a random gamma(5, 1) distribution.
gamma_draws = np.random.gamma(5, 1, 1048575)         # random gamma sample, same size as the data
gamma_sample = np.random.choice(gamma_draws, 10000)  # 10,000 instances
gamma_sample.sort()                                  # sorted -> empirical quantiles (x-axis)
loan_sample = np.random.choice(list(df.loan_amnt), 10000)  # 10,000 loan amounts
loan_sample.sort()                                   # sorted -> empirical quantiles (y-axis)
plt.scatter(x=gamma_sample, y=loan_sample)
plt.ylabel('Loan Amounts')
plt.xlabel('Random Gamma Variable')
plt.title('Q-Q Plot, Loan Amounts and Gamma Distribution')
plt.show()
It looks like much of the data follows a rough gamma distribution, except the tail is too fat to truly be considered a gamma distribution.
The whole point of investing is to get a return. Are the returns for investors worthwhile?
df.int_rate.describe()
count 1.048575e+06 mean 1.280290e+01 std 4.962779e+00 min 5.310000e+00 25% 9.160000e+00 50% 1.199000e+01 75% 1.549000e+01 max 3.099000e+01 Name: int_rate, dtype: float64
The interest rates on these loans range from 5.31% to an eye-popping 30.99%. It looks like there is potential to make some serious cash here. But wait, what if the rate of 30.99% is an extreme outlier? Let's take a look at the distribution to get a better idea of what to expect.
# Histogram of interest rates
plt.hist(df['int_rate'])
plt.title('Interest Rate Distribution')
plt.xlabel('Interest Rate, %')
plt.ylabel('Count')
plt.show()
Great! Although we ought not expect a 30% return, it looks like double-digit returns are very common. Finally, let's see how long our money would be tied up in one of these loans.
# Loan term distribution, shown two ways: bar chart and pie chart side by side
plt.figure(figsize=(10, 5)) #making a subplot
term_counts = df.term.value_counts()
plt.subplot(1, 2, 1)
term_counts.plot(kind='bar')
plt.title('Loan Terms')
plt.ylabel('Count')
plt.xlabel('Term')
plt.subplot(1, 2, 2)
term_counts.plot(kind='pie')
plt.title('Loan Terms')
plt.legend()
plt.show()
# Average interest rate by letter grade and by subgrade, side by side
plt.figure(figsize=(18, 7))
plt.subplot(1, 2, 1)
grades = df.groupby('grade') # grouping the data by loan grade
grades['int_rate'].mean().plot(kind='bar')
plt.title('Average Interest Rate by Grade')
plt.ylabel('Average Interest Rate')
plt.xlabel('Grade')
plt.subplot(1, 2, 2)
subgrades = df.groupby('sub_grade') # grouping the data by sub-grade
subgrades['int_rate'].mean().plot(kind='bar')
plt.title('Average Interest Rate by Subgrade')
plt.ylabel('Average Interest Rate')
plt.xlabel('Subgrade')
plt.show()
# Histogram of total credit utilization, clipped to the 0-100 range
plt.hist(df['all_util'], bins=10, range=(0, 100))
plt.title('Total Utilization Rate for All Borrowers')
plt.xlabel('Total Utilization Rate')
plt.ylabel('Count')
plt.show()
# Histogram of debt-to-income ratios, clipped to the 0-45 range
plt.hist(df['dti'], bins=10, range=(0, 45))
plt.title('DTI Ratio for All Borrowers')
plt.xlabel('Debt to Income Ratio')
plt.ylabel('Count')
plt.show()
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1048575 entries, 0 to 1048574 Columns: 145 entries, id to settlement_term dtypes: float64(71), int64(38), object(36) memory usage: 1.1+ GB
We can see that there are 145 columns of data and over 1 million rows, each representing a unique loan. The list of columns is mostly self-explanatory. We can see that 36 of these columns are strings, which means they are likely to be categorical values. The remaining 109 columns are either floats or integers, likely suggesting continuous variables.
df.nunique()
id 0
member_id 0
loan_amnt 1561
funded_amnt 1561
funded_amnt_inv 1580
term 2
int_rate 255
installment 69186
grade 7
sub_grade 35
emp_title 233592
emp_length 11
home_ownership 4
annual_inc 52275
verification_status 3
issue_d 27
loan_status 7
pymnt_plan 2
url 0
desc 24
purpose 13
title 15
zip_code 926
addr_state 50
dti 9676
delinq_2yrs 29
earliest_cr_line 717
inq_last_6mths 6
mths_since_last_delinq 165
mths_since_last_record 128
...
sec_app_open_acc 63
sec_app_revol_util 1164
sec_app_open_act_il 37
sec_app_num_rev_accts 81
sec_app_chargeoff_within_12_mths 21
sec_app_collections_12_mths_ex_med 17
sec_app_mths_since_last_major_derog 136
hardship_flag 2
hardship_type 1
hardship_reason 9
hardship_status 3
deferral_term 1
hardship_amount 5040
hardship_start_date 26
hardship_end_date 27
payment_plan_start_date 26
hardship_length 1
hardship_dpd 34
hardship_loan_status 5
orig_projected_additional_accrued_interest 4011
hardship_payoff_balance_amount 5596
hardship_last_payment_amount 4902
disbursement_method 2
debt_settlement_flag 2
debt_settlement_flag_date 33
settlement_status 3
settlement_date 35
settlement_amount 11100
settlement_percentage 788
settlement_term 34
Length: 145, dtype: int64
It's interesting to note that there are over 1,500 unique loan amounts, but over 69,000 unique installment amounts — while the interest rate itself takes only 255 unique values. The grades (7) and subgrades (35) will probably be the most important categorical features in our model. We can also see that certain columns, such as 'url', are likely to be unimportant since they have 1 or 0 unique values.
df.isnull().sum()*100/df.isnull().count()
id 100.000000
member_id 100.000000
loan_amnt 0.000000
funded_amnt 0.000000
funded_amnt_inv 0.000000
term 0.000000
int_rate 0.000000
installment 0.000000
grade 0.000000
sub_grade 0.000000
emp_title 8.625420
emp_length 7.387645
home_ownership 0.000000
annual_inc 0.000000
verification_status 0.000000
issue_d 0.000000
loan_status 0.000000
pymnt_plan 0.000000
url 100.000000
desc 99.997139
purpose 0.000000
title 2.222540
zip_code 0.000000
addr_state 0.000000
dti 0.114155
delinq_2yrs 0.000000
earliest_cr_line 0.000000
inq_last_6mths 0.000095
mths_since_last_delinq 51.381732
mths_since_last_record 84.068092
...
sec_app_open_acc 93.421071
sec_app_revol_util 93.535226
sec_app_open_act_il 93.421071
sec_app_num_rev_accts 93.421071
sec_app_chargeoff_within_12_mths 93.421071
sec_app_collections_12_mths_ex_med 93.421071
sec_app_mths_since_last_major_derog 97.865961
hardship_flag 0.000000
hardship_type 99.465847
hardship_reason 99.465847
hardship_status 99.465847
deferral_term 99.465847
hardship_amount 99.465847
hardship_start_date 99.465847
hardship_end_date 99.465847
payment_plan_start_date 99.465847
hardship_length 99.465847
hardship_dpd 99.465847
hardship_loan_status 99.465847
orig_projected_additional_accrued_interest 99.583816
hardship_payoff_balance_amount 99.465847
hardship_last_payment_amount 99.465847
disbursement_method 0.000000
debt_settlement_flag 0.000000
debt_settlement_flag_date 98.589038
settlement_status 98.589038
settlement_date 98.589038
settlement_amount 98.589038
settlement_percentage 98.589038
settlement_term 98.589038
Length: 145, dtype: float64
Here we can see what percentage of entries for any given column are null. We will use this information in the data cleaning process prior to building any models.
Next, we'll take a look at how the numerical columns correlate with our target, the interest rate.
# filtering for numerical columns only (drop the 36 object/string columns)
df_num = df.select_dtypes(exclude=['object'])
df_num.head()
| id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | int_rate | installment | annual_inc | url | dti | ... | deferral_term | hardship_amount | hardship_length | hardship_dpd | orig_projected_additional_accrued_interest | hardship_payoff_balance_amount | hardship_last_payment_amount | settlement_amount | settlement_percentage | settlement_term | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | 2500 | 2500 | 2500.0 | 13.56 | 84.92 | 55000.0 | NaN | 18.24 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | NaN | NaN | 30000 | 30000 | 30000.0 | 18.94 | 777.23 | 90000.0 | NaN | 26.52 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | NaN | NaN | 5000 | 5000 | 5000.0 | 17.97 | 180.69 | 59280.0 | NaN | 10.51 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | 4000 | 4000 | 4000.0 | 18.94 | 146.51 | 92000.0 | NaN | 16.74 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | 30000 | 30000 | 30000.0 | 16.14 | 731.78 | 57250.0 | NaN | 26.35 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 109 columns
# Pearson correlation of every numeric column with the target, int_rate.
# Fix: col_names was referenced but never defined — derive it from df_num.
col_names = list(df_num.columns)
corr_list = [df_num[col].corr(df_num['int_rate']) for col in col_names]
# two-column frame: feature name ('term') and its correlation with int_rate
new_df = pd.DataFrame({'term': col_names, 'corr': corr_list})
new_df.head()
| term | corr | |
|---|---|---|
| 0 | id | NaN |
| 1 | member_id | NaN |
| 2 | loan_amnt | 0.066972 |
| 3 | funded_amnt | 0.066974 |
| 4 | funded_amnt_inv | 0.066947 |
new_df.sort_values(by=['corr'], ascending=False)
| term | corr | |
|---|---|---|
| 5 | int_rate | 1.000000 |
| 100 | hardship_amount | 0.636597 |
| 103 | orig_projected_additional_accrued_interest | 0.627249 |
| 24 | total_rec_int | 0.373019 |
| 106 | settlement_amount | 0.337066 |
| 104 | hardship_payoff_balance_amount | 0.294277 |
| 47 | all_util | 0.294212 |
| 55 | bc_util | 0.265513 |
| 93 | sec_app_revol_util | 0.261570 |
| 82 | percent_bc_gt_75 | 0.260068 |
| 17 | revol_util | 0.247216 |
| 33 | dti_joint | 0.226296 |
| 105 | hardship_last_payment_amount | 0.202354 |
| 80 | num_tl_op_past_12m | 0.174312 |
| 52 | acc_open_past_24mths | 0.169056 |
| 11 | inq_last_6mths | 0.162740 |
| 51 | inq_last_12m | 0.162654 |
| 39 | open_il_12m | 0.162587 |
| 108 | settlement_term | 0.161607 |
| 40 | open_il_24m | 0.154905 |
| 43 | il_util | 0.143707 |
| 37 | open_acc_6m | 0.136144 |
| 49 | inq_fi | 0.133217 |
| 9 | dti | 0.130854 |
| 26 | recoveries | 0.123257 |
| 27 | collection_recovery_fee | 0.121402 |
| 45 | open_rv_24m | 0.118012 |
| 44 | open_rv_12m | 0.113346 |
| 6 | installment | 0.101591 |
| 90 | sec_app_inq_last_6mths | 0.093935 |
| ... | ... | ... |
| 23 | total_rec_prncp | -0.061247 |
| 46 | max_bal_bc | -0.066331 |
| 70 | num_bc_sats | -0.069052 |
| 74 | num_rev_accts | -0.070540 |
| 63 | mths_since_recent_bc | -0.071428 |
| 7 | annual_inc | -0.073365 |
| 95 | sec_app_num_rev_accts | -0.078475 |
| 53 | avg_cur_bal | -0.079970 |
| 98 | sec_app_mths_since_last_major_derog | -0.080181 |
| 36 | tot_cur_bal | -0.081759 |
| 60 | mo_sin_rcnt_rev_tl_op | -0.081796 |
| 81 | pct_tl_nvr_dlq | -0.085398 |
| 41 | mths_since_rcnt_il | -0.095490 |
| 71 | num_bc_tl | -0.102206 |
| 61 | mo_sin_rcnt_tl | -0.105942 |
| 62 | mort_acc | -0.109449 |
| 32 | annual_inc_joint | -0.120149 |
| 85 | tot_hi_cred_lim | -0.120977 |
| 65 | mths_since_recent_inq | -0.129131 |
| 91 | sec_app_mort_acc | -0.131981 |
| 59 | mo_sin_old_rev_tl_op | -0.138701 |
| 48 | total_rev_hi_lim | -0.196899 |
| 87 | total_bc_limit | -0.240657 |
| 54 | bc_open_to_buy | -0.294459 |
| 0 | id | NaN |
| 1 | member_id | NaN |
| 8 | url | NaN |
| 31 | policy_code | NaN |
| 99 | deferral_term | NaN |
| 101 | hardship_length | NaN |
109 rows × 2 columns
Now this list will help inform our decisions about which columns/features are likely to be useful in predicting interest rate.
Finally, let's take a look at the list of columns with data type string so that we can get a good idea of what categorical values will be available as potential features.
# list the string-typed columns — these are the candidate categorical features
df_obj = df.select_dtypes(include=['object'])
list(df_obj)
['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'initial_list_status', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'application_type', 'verification_status_joint', 'sec_app_earliest_cr_line', 'hardship_flag', 'hardship_type', 'hardship_reason', 'hardship_status', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date', 'hardship_loan_status', 'disbursement_method', 'debt_settlement_flag', 'debt_settlement_flag_date', 'settlement_status', 'settlement_date']
# creating the manual feature list (target int_rate plus hand-picked features)
manual_feat_list = ['int_rate', 'all_util', 'percent_bc_gt_75', 'dti_joint', 'num_tl_op_past_12m', 'inq_last_6mths', 'open_il_12m', 'bc_open_to_buy', 'total_bc_limit', 'mo_sin_old_rev_tl_op', 'sec_app_mort_acc', 'annual_inc_joint', 'pub_rec_bankruptcies', 'term', 'sub_grade', 'home_ownership']
# creating a dataframe with only the selected manual features
# (df.filter returns a new frame, so the previous empty-DataFrame assignment was dead code)
man_df = df.filter(items=manual_feat_list)
man_df.head()
| int_rate | all_util | percent_bc_gt_75 | dti_joint | num_tl_op_past_12m | inq_last_6mths | open_il_12m | bc_open_to_buy | total_bc_limit | mo_sin_old_rev_tl_op | sec_app_mort_acc | annual_inc_joint | pub_rec_bankruptcies | term | sub_grade | home_ownership | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13.56 | 28.0 | 0.0 | NaN | 3 | 1.0 | 1.0 | 34360.0 | 36500 | 212 | NaN | NaN | 1 | 36 months | C1 | RENT |
| 1 | 18.94 | 57.0 | 0.0 | NaN | 6 | 0.0 | 2.0 | 13761.0 | 15000 | 378 | NaN | NaN | 1 | 60 months | D2 | MORTGAGE |
| 2 | 17.97 | 35.0 | 0.0 | NaN | 0 | 0.0 | 0.0 | 13800.0 | 13800 | 92 | NaN | NaN | 0 | 36 months | D1 | MORTGAGE |
| 3 | 18.94 | 70.0 | 100.0 | NaN | 3 | 0.0 | 3.0 | 1239.0 | 5000 | 154 | NaN | NaN | 0 | 36 months | D2 | MORTGAGE |
| 4 | 16.14 | 54.0 | 0.0 | NaN | 5 | 0.0 | 3.0 | 8471.0 | 9300 | 216 | NaN | NaN | 0 | 60 months | C4 | MORTGAGE |
Now we will get dummy values for the categorical features in the manual feature list.
# One-hot encode all three categorical features. The original code only
# encoded 'term', but the resulting frame (51 columns, incl. F1..G5) shows
# sub_grade and home_ownership dummies as well — restore those encodings.
man_df = pd.concat([man_df,
                    pd.get_dummies(man_df['term'], drop_first=True),
                    pd.get_dummies(man_df['sub_grade'], drop_first=True),
                    pd.get_dummies(man_df['home_ownership'], drop_first=True)],
                   axis=1)
# dropping the string columns now that we've gotten dummies
# (drop(columns=...) — the positional axis argument was removed in pandas 2.0)
man_df = man_df.drop(columns=['sub_grade', 'home_ownership', 'term'])
man_df.head()
| int_rate | all_util | percent_bc_gt_75 | dti_joint | num_tl_op_past_12m | inq_last_6mths | open_il_12m | bc_open_to_buy | total_bc_limit | mo_sin_old_rev_tl_op | ... | F1 | F2 | F3 | F4 | F5 | G1 | G2 | G3 | G4 | G5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13.56 | 28.0 | 0.0 | NaN | 3 | 1.0 | 1.0 | 34360.0 | 36500 | 212 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 18.94 | 57.0 | 0.0 | NaN | 6 | 0.0 | 2.0 | 13761.0 | 15000 | 378 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 17.97 | 35.0 | 0.0 | NaN | 0 | 0.0 | 0.0 | 13800.0 | 13800 | 92 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 18.94 | 70.0 | 100.0 | NaN | 3 | 0.0 | 3.0 | 1239.0 | 5000 | 154 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 16.14 | 54.0 | 0.0 | NaN | 5 | 0.0 | 3.0 | 8471.0 | 9300 | 216 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 51 columns
man_df.fillna(0, inplace=True)
Finally, we replace all NaN values with zeroes in the manual features dataframe. Our manual features are now ready to feed into a model.
# fixed 10,000-row sample (seeded for reproducibility) to keep clustering tractable
trim_df = man_df.sample(10000, random_state=42)
trim_df.head(10)
| int_rate | all_util | percent_bc_gt_75 | dti_joint | num_tl_op_past_12m | inq_last_6mths | open_il_12m | bc_open_to_buy | total_bc_limit | mo_sin_old_rev_tl_op | ... | F1 | F2 | F3 | F4 | F5 | G1 | G2 | G3 | G4 | G5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 781974 | 11.47 | 51.0 | 33.3 | 0.0 | 2 | 0.0 | 0.0 | 15090.0 | 31300 | 241 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 937737 | 13.67 | 80.0 | 0.0 | 0.0 | 2 | 1.0 | 2.0 | 1770.0 | 2700 | 346 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 907828 | 15.99 | 87.0 | 90.0 | 0.0 | 2 | 0.0 | 0.0 | 2097.0 | 53300 | 251 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 784628 | 11.47 | 51.0 | 42.9 | 0.0 | 2 | 0.0 | 0.0 | 8868.0 | 24600 | 235 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 662460 | 15.31 | 94.0 | 100.0 | 0.0 | 3 | 0.0 | 1.0 | 1100.0 | 24100 | 214 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 280139 | 10.07 | 60.0 | 28.6 | 0.0 | 0 | 0.0 | 0.0 | 7155.0 | 17700 | 121 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 355572 | 6.07 | 53.0 | 25.0 | 0.0 | 3 | 0.0 | 0.0 | 15777.0 | 33900 | 125 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 749979 | 19.53 | 56.0 | 100.0 | 0.0 | 6 | 3.0 | 1.0 | 367.0 | 6800 | 125 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 374753 | 9.43 | 33.0 | 0.0 | 0.0 | 1 | 0.0 | 0.0 | 9396.0 | 9800 | 158 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 17327 | 7.02 | 64.0 | 66.7 | 0.0 | 0 | 0.0 | 0.0 | 5930.0 | 32900 | 265 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10 rows × 51 columns
trim_df2 = trim_df.copy()
We'll begin with Mean Shift as our first clustering technique. We will use SK Learn's estimate_bandwidth function and see how many clusters are identified "out of the box."
from sklearn.cluster import MeanShift, estimate_bandwidth
# Here we set the bandwidth. This function automatically derives a bandwidth
# number based on an inspection of the distances among points in the data.
# NOTE(review): the features are unscaled, so the estimate (~26,439) is on the
# scale of the dollar-amount columns — consider StandardScaler first; confirm.
my_bandwidth = estimate_bandwidth(trim_df)
my_bandwidth
26438.79205724502
# Declare and fit the model using the auto-estimated bandwidth;
# bin_seeding speeds up convergence by seeding from a discretized grid.
ms = MeanShift(bandwidth=my_bandwidth, bin_seeding=True)
ms.fit(trim_df)
# Extract cluster assignments for each data point.
labels = ms.labels_
# Coordinates of the cluster centers.
cluster_centers = ms.cluster_centers_
# Count our clusters (number of distinct labels found).
n_clusters_ = len(np.unique(labels))
print("Number of estimated clusters: {}".format(n_clusters_))
Number of estimated clusters: 34
# attach each point's cluster assignment and inspect the cluster sizes
trim_df['cluster'] = labels
trim_df.cluster.value_counts()
0 8990 1 509 13 125 6 120 3 74 2 44 4 33 8 30 7 20 9 10 26 7 5 6 29 3 10 2 17 2 15 2 11 2 14 2 16 2 12 2 20 2 25 1 24 1 32 1 31 1 33 1 18 1 23 1 27 1 28 1 21 1 22 1 30 1 19 1 Name: cluster, dtype: int64
# Helper: scatter one feature against a y-column, colored by cluster label.
# Factors out the six copy-pasted plotting cells into a single definition.
def _cluster_scatter(data, x, xlabel, title, y='int_rate', ylabel='Interest Rate'):
    plt.figure(figsize=(15, 12))
    sns.scatterplot(y=y, x=x, hue='cluster', data=data, palette='Spectral')
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.title(title)
    plt.show()

_cluster_scatter(trim_df, 'all_util', 'Utilization Rate',
                 'Interest Rate vs Utilization Rate, Colored by Cluster')
_cluster_scatter(trim_df, 'dti_joint', 'Debt to Income Ratio',
                 'Interest Rate vs DTI Ratio, Colored by Cluster')
_cluster_scatter(trim_df, 'bc_open_to_buy', 'Bank Card Available Credit',
                 'Interest Rate vs Available Bank Card Credit, Colored by Cluster')
_cluster_scatter(trim_df, 'mo_sin_old_rev_tl_op', 'Months Since Oldest Credit Line Opened',
                 'Interest Rate vs Months Since Oldest Credit Line Opened, Colored by Cluster')
_cluster_scatter(trim_df, 'total_bc_limit', 'Total Bank Card Limit',
                 'Interest Rate vs Total Bank Card Limit, Colored by Cluster')
_cluster_scatter(trim_df, 'bc_open_to_buy', 'Bank Card Available Credit',
                 'Annual Income vs Available Bank Card Credit, Colored by Cluster',
                 y='annual_inc_joint', ylabel='Annual Income')
34 clusters seems like a lot for this data set, since several clusters have only 1-3 data points. Also, from our diagnostic plots, it looks like the clustering algorithm is keying off of available credit metrics, but not much else is clear.
As our final piece of analysis for this clustering technique, we will take a random sample of the data and re-run the clustering algorithm. We'll then feed the cluster assignments from each run into the Adjusted Rand Index function to get an idea of how stable this particular technique is.
# ARI stability check 1: sample 2,000 points and re-cluster them.
xtra_trim_df = trim_df.sample(2000, random_state=42)
temp_list = xtra_trim_df['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df = xtra_trim_df.drop(columns='cluster')
# NOTE(review): fit_predict re-fits MeanShift on the sample, so the ARI
# compares two independent fits rather than predictions of the original model
predict = ms.fit_predict(xtra_trim_df)
from sklearn import metrics
metrics.adjusted_rand_score(temp_list, predict)
0.904035049295049
# ARI stability check 2: different random sample, same procedure.
xtra_trim_df = trim_df.sample(2000, random_state=20)
temp_list = xtra_trim_df['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df = xtra_trim_df.drop(columns='cluster')
predict = ms.fit_predict(xtra_trim_df)
from sklearn import metrics
metrics.adjusted_rand_score(temp_list, predict)
0.8892519566581502
# ARI stability check 3: third random sample, same procedure.
xtra_trim_df = trim_df.sample(2000, random_state=2)
temp_list = xtra_trim_df['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df = xtra_trim_df.drop(columns='cluster')
predict = ms.fit_predict(xtra_trim_df)
from sklearn import metrics
metrics.adjusted_rand_score(temp_list, predict)
0.8191570292783413
The average ARI is 0.871 and the range of outcomes is 0.085. We'll have to do the same analysis on several other clustering techniques to see how this technique compares.
Since I felt like the first iteration of Mean Shift identified too many clusters, this time we will manually set the bandwidth and come up with only 5 clusters.
# Second Mean Shift run with a manually chosen (larger) bandwidth.
# Fix: trim_df0 was never defined — build it as a feature-only copy of the
# sample (errors='ignore' handles the case where no 'cluster' column exists).
trim_df0 = trim_df.drop(columns='cluster', errors='ignore')
ms0 = MeanShift(bandwidth=75000, bin_seeding=True)
ms0.fit(trim_df0)
# Extract cluster assignments for each data point.
labels0 = ms0.labels_
# Coordinates of the cluster centers.
cluster_centers0 = ms0.cluster_centers_
# Count our clusters.
n_clusters_0 = len(np.unique(labels0))
print("Number of estimated clusters: {}".format(n_clusters_0))
Number of estimated clusters: 5
trim_df0.cluster.value_counts()
0 9272 1 679 3 29 2 17 4 3 Name: cluster, dtype: int64
# Helper: scatter one feature against a y-column, colored by cluster label.
# Factors out the six copy-pasted plotting cells into a single definition.
def _cluster_scatter(data, x, xlabel, title, y='int_rate', ylabel='Interest Rate'):
    plt.figure(figsize=(15, 12))
    sns.scatterplot(y=y, x=x, hue='cluster', data=data, palette='Spectral')
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.title(title)
    plt.show()

_cluster_scatter(trim_df0, 'all_util', 'Utilization Rate',
                 'Interest Rate vs Utilization Rate, Colored by Cluster')
_cluster_scatter(trim_df0, 'dti_joint', 'Debt to Income Ratio',
                 'Interest Rate vs DTI Ratio, Colored by Cluster')
_cluster_scatter(trim_df0, 'bc_open_to_buy', 'Bank Card Available Credit',
                 'Interest Rate vs Available Bank Card Credit, Colored by Cluster')
_cluster_scatter(trim_df0, 'mo_sin_old_rev_tl_op', 'Months Since Oldest Credit Line Opened',
                 'Interest Rate vs Months Since Oldest Credit Line Opened, Colored by Cluster')
_cluster_scatter(trim_df0, 'total_bc_limit', 'Total Bank Card Limit',
                 'Interest Rate vs Total Bank Card Limit, Colored by Cluster')
_cluster_scatter(trim_df0, 'bc_open_to_buy', 'Bank Card Available Credit',
                 'Annual Income vs Available Bank Card Credit, Colored by Cluster',
                 y='annual_inc_joint', ylabel='Annual Income')
From the diagnostic plots it looks like people with little or no debt and with relatively lower incomes are being assigned to cluster 0. Annual income, debt to income ratio, and available credit all appear to have strong predictive power for cluster assignment. There is much more observable differentiation amongst clusters in this iteration than in the previous.
# ARI stability check 1 for the manual-bandwidth run.
xtra_trim_df0 = trim_df0.sample(2000, random_state=42)
temp_list0 = xtra_trim_df0['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df0 = xtra_trim_df0.drop(columns='cluster')
predict0 = ms0.fit_predict(xtra_trim_df0)
metrics.adjusted_rand_score(temp_list0, predict0)
0.9091481368995394
# ARI stability check 2 for the manual-bandwidth run.
xtra_trim_df0 = trim_df0.sample(2000, random_state=20)
temp_list0 = xtra_trim_df0['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df0 = xtra_trim_df0.drop(columns='cluster')
predict0 = ms0.fit_predict(xtra_trim_df0)
metrics.adjusted_rand_score(temp_list0, predict0)
0.9657549572506299
# ARI stability check 3 for the manual-bandwidth run.
xtra_trim_df0 = trim_df0.sample(2000, random_state=2)
temp_list0 = xtra_trim_df0['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df0 = xtra_trim_df0.drop(columns='cluster')
predict0 = ms0.fit_predict(xtra_trim_df0)
metrics.adjusted_rand_score(temp_list0, predict0)
0.9136727484879532
The average ARI is 0.930 and the range is 0.057. This iteration seems to produce a more stable cluster assignment than the first iteration.
Now we'll take a look at K-Means clustering.
Since we are forced to pick the number of clusters with K-Means and since 5 clusters performed best in Mean-Shift, we will start with K=5.
from sklearn.cluster import KMeans
# Fit K-Means with K=5 (seeded for reproducibility) on the K-Means copy.
km2 = KMeans(n_clusters=5, random_state=42).fit(trim_df2)
km2_labels = km2.labels_
km2_cluster_centers = km2.cluster_centers_
# Fix: attach the assignments — 'cluster' was read below but never assigned,
# and the downstream plots color by this column.
trim_df2['cluster'] = km2_labels
trim_df2.cluster.value_counts()
3 6603 0 2281 2 533 4 409 1 174 Name: cluster, dtype: int64
# Helper: scatter one feature against a y-column, colored by cluster label.
# Factors out the six copy-pasted plotting cells into a single definition.
def _cluster_scatter(data, x, xlabel, title, y='int_rate', ylabel='Interest Rate'):
    plt.figure(figsize=(15, 12))
    sns.scatterplot(y=y, x=x, hue='cluster', data=data, palette='Spectral')
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.title(title)
    plt.show()

_cluster_scatter(trim_df2, 'all_util', 'Utilization Rate',
                 'Interest Rate vs Utilization Rate, Colored by Cluster')
_cluster_scatter(trim_df2, 'dti_joint', 'Debt to Income Ratio',
                 'Interest Rate vs DTI Ratio, Colored by Cluster')
_cluster_scatter(trim_df2, 'bc_open_to_buy', 'Bank Card Available Credit',
                 'Interest Rate vs Available Bank Card Credit, Colored by Cluster')
_cluster_scatter(trim_df2, 'mo_sin_old_rev_tl_op', 'Months Since Oldest Credit Line Opened',
                 'Interest Rate vs Months Since Oldest Credit Line Opened, Colored by Cluster')
_cluster_scatter(trim_df2, 'total_bc_limit', 'Total Bank Card Limit',
                 'Interest Rate vs Total Bank Card Limit, Colored by Cluster')
_cluster_scatter(trim_df2, 'bc_open_to_buy', 'Bank Card Available Credit',
                 'Annual Income vs Available Bank Card Credit, Colored by Cluster',
                 y='annual_inc_joint', ylabel='Annual Income')
Available credit and annual income seem to be important variables for this clustering assignment, as we see clear differentiation when those variables are plotted.
# ARI stability check 1 for K=5.
xtra_trim_df2 = trim_df2.sample(2000, random_state=42)
temp_list2 = xtra_trim_df2['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df2 = xtra_trim_df2.drop(columns='cluster')
predict2 = km2.fit_predict(xtra_trim_df2)
metrics.adjusted_rand_score(temp_list2, predict2)
0.9695802540666566
# ARI stability check 2 for K=5.
xtra_trim_df2 = trim_df2.sample(2000, random_state=20)
temp_list2 = xtra_trim_df2['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df2 = xtra_trim_df2.drop(columns='cluster')
predict2 = km2.fit_predict(xtra_trim_df2)
metrics.adjusted_rand_score(temp_list2, predict2)
0.9883820510764837
# ARI stability check 3 for K=5.
xtra_trim_df2 = trim_df2.sample(2000, random_state=2)
temp_list2 = xtra_trim_df2['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df2 = xtra_trim_df2.drop(columns='cluster')
predict2 = km2.fit_predict(xtra_trim_df2)
metrics.adjusted_rand_score(temp_list2, predict2)
0.9713637426565112
The average ARI is 0.976 and the range is 0.018. This is clearly the best performance so far.
Now let's increase the number of clusters and see how performance changes.
from sklearn.cluster import KMeans
# Fix: trim_df3 was never defined — build a feature-only copy of the K-Means
# sample (errors='ignore' covers the case where no 'cluster' column exists).
trim_df3 = trim_df2.drop(columns='cluster', errors='ignore')
# Fit K-Means with K=12, seeded for reproducibility.
km3 = KMeans(n_clusters=12, random_state=42).fit(trim_df3)
km3_labels = km3.labels_
km3_cluster_centers = km3.cluster_centers_
# attach the assignments for the diagnostic plots, then inspect cluster sizes
trim_df3['cluster'] = km3_labels
trim_df3.cluster.value_counts()
0 4182 9 2648 3 1325 8 526 5 358 10 285 1 254 2 208 4 115 7 53 11 25 6 21 Name: cluster, dtype: int64
# Helper: scatter one feature against a y-column, colored by cluster label.
# Factors out the six copy-pasted plotting cells into a single definition.
def _cluster_scatter(data, x, xlabel, title, y='int_rate', ylabel='Interest Rate'):
    plt.figure(figsize=(15, 12))
    sns.scatterplot(y=y, x=x, hue='cluster', data=data, palette='Spectral')
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.title(title)
    plt.show()

_cluster_scatter(trim_df3, 'all_util', 'Utilization Rate',
                 'Interest Rate vs Utilization Rate, Colored by Cluster')
_cluster_scatter(trim_df3, 'dti_joint', 'Debt to Income Ratio',
                 'Interest Rate vs DTI Ratio, Colored by Cluster')
_cluster_scatter(trim_df3, 'bc_open_to_buy', 'Bank Card Available Credit',
                 'Interest Rate vs Available Bank Card Credit, Colored by Cluster')
_cluster_scatter(trim_df3, 'mo_sin_old_rev_tl_op', 'Months Since Oldest Credit Line Opened',
                 'Interest Rate vs Months Since Oldest Credit Line Opened, Colored by Cluster')
_cluster_scatter(trim_df3, 'total_bc_limit', 'Total Bank Card Limit',
                 'Interest Rate vs Total Bank Card Limit, Colored by Cluster')
_cluster_scatter(trim_df3, 'bc_open_to_buy', 'Bank Card Available Credit',
                 'Annual Income vs Available Bank Card Credit, Colored by Cluster',
                 y='annual_inc_joint', ylabel='Annual Income')
Once again, available credit and income seem to be the variables driving cluster assignments.
# ARI stability check 1 for K=12.
xtra_trim_df3 = trim_df3.sample(2000, random_state=42)
temp_list3 = xtra_trim_df3['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df3 = xtra_trim_df3.drop(columns='cluster')
predict3 = km3.fit_predict(xtra_trim_df3)
metrics.adjusted_rand_score(temp_list3, predict3)
0.9162778780804587
# ARI stability check 2 for K=12.
xtra_trim_df3 = trim_df3.sample(2000, random_state=20)
temp_list3 = xtra_trim_df3['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df3 = xtra_trim_df3.drop(columns='cluster')
predict3 = km3.fit_predict(xtra_trim_df3)
metrics.adjusted_rand_score(temp_list3, predict3)
0.9029953419740511
# ARI stability check 3 for K=12.
xtra_trim_df3 = trim_df3.sample(2000, random_state=2)
temp_list3 = xtra_trim_df3['cluster']
# drop(columns=...): the positional axis argument was removed in pandas 2.0
xtra_trim_df3 = xtra_trim_df3.drop(columns='cluster')
predict3 = km3.fit_predict(xtra_trim_df3)
metrics.adjusted_rand_score(temp_list3, predict3)
0.7956839125462816
The average ARI is 0.872 and the range is 0.120. Setting K=12 is clearly less stable than K=5.
from sklearn.cluster import KMeans

# Fit K-Means with K=20 on the same feature set.
# NOTE(review): if trim_df4 still carries a 'cluster' column from an earlier
# model at this point, it is being used as a clustering feature here — confirm
# whether it should be dropped before fitting.
km4 = KMeans(n_clusters=20, random_state=42).fit(trim_df4)
km4_labels = km4.labels_
km4_cluster_centers = km4.cluster_centers_
# Attach this model's assignments to the frame so the value counts below and
# the hue='cluster' plots reflect km4 (the original read trim_df4.cluster
# without a visible assignment of km4's labels).
trim_df4['cluster'] = km4_labels
trim_df4.cluster.value_counts()
def _plot_cluster_scatter4(x_col, y_col, x_label, y_label, title):
    """Render one cluster-colored scatterplot from trim_df4.

    Draws y_col against x_col, coloring points by the 'cluster' column,
    on a fresh 15x12 figure, then shows it.
    """
    plt.figure(figsize=(15, 12))
    sns.scatterplot(y=y_col, x=x_col, hue='cluster', data=trim_df4, palette='Spectral')
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.title(title)
    plt.show()

# The same six diagnostic plots as before, driven by a spec table instead of
# six copy-pasted cells; figures render in the original order.
for _spec in [
    ('all_util', 'int_rate',
     'Utilization Rate', 'Interest Rate',
     'Interest Rate vs Utilization Rate, Colored by Cluster'),
    ('dti_joint', 'int_rate',
     'Debt to Income Ratio', 'Interest Rate',
     'Interest Rate vs DTI Ratio, Colored by Cluster'),
    ('bc_open_to_buy', 'int_rate',
     'Bank Card Available Credit', 'Interest Rate',
     'Interest Rate vs Available Bank Card Credit, Colored by Cluster'),
    ('mo_sin_old_rev_tl_op', 'int_rate',
     'Months Since Oldest Credit Line Opened', 'Interest Rate',
     'Interest Rate vs Months Since Oldest Credit Line Opened, Colored by Cluster'),
    ('total_bc_limit', 'int_rate',
     'Total Bank Card Limit', 'Interest Rate',
     'Interest Rate vs Total Bank Card Limit, Colored by Cluster'),
    ('bc_open_to_buy', 'annual_inc_joint',
     'Bank Card Available Credit', 'Annual Income',
     'Annual Income vs Available Bank Card Credit, Colored by Cluster'),
]:
    _plot_cluster_scatter4(*_spec)
As with almost all the clustering techniques we've evaluated, available credit and income are the variables driving cluster assignment, although the dividing lines between clusters along these axes are less clear than in some of the earlier iterations.
# Stability check for the K=20 model: sample 2,000 rows, hold out the
# full-fit labels, re-fit on the sample, and score agreement with the
# Adjusted Rand Index.
xtra_trim_df4 = trim_df4.sample(2000, random_state=42)
temp_list4 = xtra_trim_df4['cluster']
# drop(columns=...) replaces the positional-axis form drop('cluster', 1),
# which was deprecated in pandas 1.0 and removed in pandas 2.0.
xtra_trim_df4 = xtra_trim_df4.drop(columns='cluster')
predict4 = km4.fit_predict(xtra_trim_df4)
metrics.adjusted_rand_score(temp_list4, predict4)
0.8950973232295207
# Second stability draw (different random_state) for the K=20 model.
xtra_trim_df4 = trim_df4.sample(2000, random_state=20)
temp_list4 = xtra_trim_df4['cluster']
# drop(columns=...) replaces the positional-axis form removed in pandas 2.0.
xtra_trim_df4 = xtra_trim_df4.drop(columns='cluster')
predict4 = km4.fit_predict(xtra_trim_df4)
metrics.adjusted_rand_score(temp_list4, predict4)
0.690443218273136
# Third stability draw (different random_state) for the K=20 model.
xtra_trim_df4 = trim_df4.sample(2000, random_state=2)
temp_list4 = xtra_trim_df4['cluster']
# drop(columns=...) replaces the positional-axis form removed in pandas 2.0.
xtra_trim_df4 = xtra_trim_df4.drop(columns='cluster')
predict4 = km4.fit_predict(xtra_trim_df4)
metrics.adjusted_rand_score(temp_list4, predict4)
0.8208139117017047
The average ARI is 0.802 and the range is 0.205. This is definitely the weakest K-Means option evaluated.
From both the quantitative (Adjusted Rand Index scores) and qualitative (diagnostic plots) perspectives, employing a K-Means clustering algorithm with 5 clusters is the best option evaluated. Also, from the diagnostic plots we can tell that available credit and annual income of the borrower are the variables that really drive clustering. This tells us that if we want to further analyze or make some decisions about these loans that we should pay particular attention to these variables.
That being said, I feel that clustering is much less useful on this data set than modeling (i.e. regression modeling). With all of this work, all we've really learned is that available credit and annual income are probably the most important variables for differentiating one loan from another. However, with a regression model we could predict performance of loans or the expected interest rate for a given borrower profile and use this information to make investment decisions. In short, clustering helps us learn more about this data set, but only modeling can help us make decisions based on the data.